#Load packages

library(data.table)
library(tidyr)
library(haven)
library(ggplot2)
library(dplyr)
library(gridExtra)
library(lmtest)
library(maps)
library(mapdata)
library(readxl)

#Add data GPS

gps_data <- haven::read_dta("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/GPS_dataset_individual_level/individual_new.dta")

head(country_data)

Clean the data from missing values

# Clean the data by removing records with missing values
cleaned_data <- gps_data %>%
  drop_na(country, isocode, risktaking, gender, age)

# Calculate the number of records removed per variable
records_removed_per_variable <- colSums(is.na(gps_data)) - colSums(is.na(cleaned_data))

# Display the cleaned data
cleaned_data

# Display the number of records removed per variable
records_removed_per_variable
         country          isocode             ison           region         language             date 
               0                0                0                0                0                0 
       id_gallup              wgt         patience       risktaking         posrecip         negrecip 
               0                0              190              634               32              247 
        altruism            trust subj_math_skills           gender              age 
              74              163              132                0              276 

List of all variable

# List all variables
variable_list <- names(cleaned_data) 

# Display the list of variables
print(variable_list)
 [1] "country"          "isocode"          "ison"             "region"           "language"        
 [6] "date"             "id_gallup"        "wgt"              "patience"         "risktaking"      
[11] "posrecip"         "negrecip"         "altruism"         "trust"            "subj_math_skills"
[16] "gender"           "age"             

#select only the variables of interest

cleaned_data <- cleaned_data %>%
  select(country, isocode, ison, risktaking, gender, age)
cleaned_data

Calculate Z-Score for Age per Country

cleaned_data <- cleaned_data %>%
  group_by(country) %>%
  mutate(z_score_age = scale(age))

# Display the new column with Z-Scores per Country
head(cleaned_data)

Create a new column “age_group” with the age categories

cleaned_data$agecat <- cut(
  cleaned_data$age,
  breaks = c(15, 20, 30, 40, 50, 60, 70, 80, Inf), # The category boundaries
  labels = c("15-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70-79", "80+"), # The category labels
  right = FALSE # Left end (inclusive), right end (exclusive)
)

# Display the new column
head(cleaned_data)

Calculate the the mean and add column

# Calculate mean values for risktaking, gender, and age per country
country_means <- cleaned_data %>%
  group_by(country) %>%
  summarize(
    mean_risktaking = mean(risktaking, na.rm = TRUE),
    mean_gender = mean(gender, na.rm = TRUE),
    mean_age = mean(age, na.rm = TRUE), 
    mean_z_score_age = mean(z_score_age, na.rm = TRUE),
  )

# Merge mean values back to the original dataset
cleaned_data <- merge(cleaned_data, country_means, by = "country", all.x = TRUE)

# Display the updated dataset
head(cleaned_data)
NA

#Hardship-list

excel_path <- "/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/Hardship/Hardship.xlsx"

hardship_values <- read_excel(excel_path) # Read data from the Excel file
New names:
print(colnames(cleaned_data)) # Display column names in cleaned_data
 [1] "country"          "isocode"          "ison"             "risktaking"       "gender"          
 [6] "age"              "z_score_age"      "agecat"           "mean_risktaking"  "mean_gender"     
[11] "mean_age"         "mean_z_score_age"
print(colnames(hardship_values)) # Display column names in hardship_data
[1] "country"          "mean_homicide"    "gdp"              "gini"             "Infant_mortality"
[6] "life_expect"      "hardship"         "...8"            
cleaned_data <- left_join(cleaned_data, hardship_values, by = "country") # Perform the left_join operation

head(cleaned_data) # Display the updated dataset

#select only the variables of interest

gps_country <- cleaned_data %>%
  select(country, isocode, ison, mean_risktaking, mean_gender, mean_age, mean_z_score_age, hardship)
gps_country

Calculate mean values for risktaking, gender, and age per country

#select only the variables of interest
gps_country <- cleaned_data %>%
  group_by(country) %>%
  summarize(
    isocode = first(isocode),
    ison = first(ison),
    mean_risktaking = mean(risktaking, na.rm = TRUE),
    mean_gender = mean(gender, na.rm = TRUE),
    mean_age = mean(age, na.rm = TRUE),
    hardship = first(hardship)
  ) %>%
  ungroup()

gps_country

Descriptive

Number of courtries after data cleaning

# Determine the number of different countries
number_of_countries <- length(unique(cleaned_data$country))

# Display the number of different countries
number_of_countries
[1] 76

#CODE FOR THE PRESENTATION ########### ########## ##########

Worldmap of the recorded countries

world_map <- map_data("world") # Create a world map with country borders

recorded_countries <- unique(cleaned_data$country) # Get the list of recorded countries from your cleaned_data

world_map$recorded <- ifelse(world_map$region %in% recorded_countries, "Recorded", "Not Recorded") # Create a new variable indicating whether a country has been recorded or not

# Plot the world map with recorded countries highlighted
ggplot(world_map, aes(x = long, y = lat, group = group, fill = recorded)) +
  geom_polygon(color = "white") +
  scale_fill_manual(values = c("Recorded" = "darkblue", "Not Recorded" = "lightgrey"), guide = "none") +  # Set guide to "none" to remove the legend
  theme_void() +
  labs(title = "GPS", fill = "Status") +
  theme(legend.position = "none", plot.title = element_text(hjust = 0.5))  # Remove legend and center the title

All about the age

# Age Range
age_min <- min(cleaned_data$age, na.rm = TRUE)
age_max <- max(cleaned_data$age, na.rm = TRUE)

# Average Age
average_age <- mean(cleaned_data$age, na.rm = TRUE)

# Median Age
median_age <- median(cleaned_data$age, na.rm = TRUE)

# Display the age statistics
cat("Age Range: ", age_min, " to ", age_max, "\n")
Age Range:  15  to  99 
cat("Average Age: ", average_age, "\n")
Average Age:  41.73605 
cat("Median Age: ", median_age, "\n")
Median Age:  40 

Number of participants in each age category

# number of participants in each age category
agecat_counts <- table(cleaned_data$agecat)

# Display the number of participants in the age categories
print(agecat_counts)

15-19 20-29 30-39 40-49 50-59 60-69 70-79   80+ 
 6888 16872 15905 13583 11374  8570  4688  1559 

Boxplot of Age Distribution per Country

ggplot(cleaned_data, aes(x = age)) +
  geom_histogram(binwidth = 0.5, fill = "lightblue", color = "blue") +
  labs(x = "Age", y = "Frequency", title = "Histogram of Age Distributionn") +
  theme_minimal()


ggplot(cleaned_data, aes(x = country, y = age)) +
  geom_boxplot(fill = "lightblue", color = "blue") +
  labs(title = "Distribution of Age per Country", x = "Country", y = "Age") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))


ggplot(cleaned_data, aes(x = country, y = risktaking)) +
  geom_boxplot(fill = "lightblue", color = "blue") +
  labs(title = "Distribution of Age per Country", x = "Country", y = "Risktaking") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

All about the gender

# Calculate the counts of females (gender = 1) and males (gender = 2)
gender_counts <- table(cleaned_data$gender)

# Display the counts
cat("Number of Females: ", gender_counts[1], "\n")
Number of Females:  36024 
cat("Number of Males: ", gender_counts[2], "\n")
Number of Males:  43415 

Table with country and gender

# Create a table that breaks down the number of participants by country and gender
gender_by_country <- xtabs(~ country + gender, data = cleaned_data)

# Rename columns and rows for better readability
colnames(gender_by_country) <- c("Female", "Male")
rownames(gender_by_country) <- unique(cleaned_data$country)

# Calculate the total participants by summing the rows
total_participants <- rowSums(gender_by_country)

# Create a data frame with country, total participants, female, and male
result_table <- data.frame(
  country = rownames(gender_by_country),
  total_participants = total_participants,
  female = gender_by_country[, "Female"],
  male = gender_by_country[, "Male"]
)

# Display the result table
result_table

Analysis of item risktaking

# Summary statistics for risktaking
risk_summary <- summary(cleaned_data$risktaking)

# Histogram of risktaking
ggplot(cleaned_data, aes(x = risktaking)) +
  geom_histogram(binwidth = 1, fill = "lightblue", color = "blue", alpha = 0.7) +
  labs(title = "Distribution of Risk-Taking", x = "Risk-Taking Score", y = "Frequency") +
  theme_minimal()


# Boxplot of risktaking by gender
ggplot(cleaned_data, aes(x = as.factor(gender), y = risktaking, fill = as.factor(gender))) +
  geom_boxplot() +
  labs(title = "Risk-Taking by Gender", x = "Gender", y = "Risk-Taking Score") +
  scale_x_discrete(labels = c("0" = "male", "1" = "female")) +
  theme_minimal() +
  guides(fill = FALSE)


# Risk vs age
ggplot(cleaned_data, aes(risktaking, age)) +
  geom_point(size = 0.2) +  
  geom_smooth(method = "lm")


# Boxplot of risktaking by age group
cleaned_data$age_group <- cut(cleaned_data$age, breaks = c(0, 25, 35, 45, 55, Inf),
                               labels = c("18-25", "26-35", "36-45", "46-55", "56+"))
ggplot(cleaned_data, aes(x = age_group, y = risktaking, fill = age_group)) +
  geom_boxplot() +
  labs(title = "Risk-Taking by Age Group", x = "Age Group", y = "Risk-Taking Score") +
  theme_minimal() +
  guides(fill = FALSE)


# Boxplot of risktaking by country
ggplot(cleaned_data, aes(x = country, y = risktaking)) +
  geom_boxplot(fill = "lightblue", color = "blue") +
  labs(title = "Risk-Taking by Country", x = "Country", y = "Risk-Taking Score") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

FOR PRESENTATION #############################

Risk vs age with color-coded gender per Country

# Skalierung des Z-Scores für das Alter anpassen
cleaned_data$z_score_age_scaled <- 15 * cleaned_data$z_score_age + 42

# Risk vs age with color-coded gender per Country
ggplot(cleaned_data, aes(z_score_age_scaled, risktaking, color = factor(gender))) +
  geom_point(size = 0.1) +  
  geom_smooth(method = "lm") +
  geom_vline(xintercept = 42, linetype = "dashed", color = "black", size = 1) +  # Vertikale Linie für den Mittelwert
  scale_color_manual(values = c("blue", "red"), labels = c("Male", "Female")) +
  labs(color = "Gender") +
  ggtitle("Risk vs Age GPS") +
  xlab("Age (Mean = 42, SD = 15)") +
  scale_x_continuous(breaks = seq(0, 100, by = 15), limits = c(0, 100)) +  # Anpassung der Intervalle auf der X-Achse
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

Risktaking vs. hardship across countries

# Risktaking vs. Hardship across countries with summarized points
ggplot(cleaned_data, aes(x = hardship, y = mean_risktaking)) +
  geom_point(aes(color = isocode), size = 3) +
  geom_text(aes(label = isocode), vjust = -0.5, hjust = 0.5, size = 3) +
  labs(title = "Risktaking vs. Hardship across Countries GPS",
       x = "Hardship", y = "Risktaking") +
  theme_minimal() +
  theme(legend.position = "none", plot.title = element_text(hjust = 0.5))

# Risktaking vs. Hardship across countries with summarized points
ggplot(cleaned_data, aes(x = hardship, y = mean_z_score_age)) +
  geom_point(aes(color = isocode), size = 3) +
  geom_text(aes(label = isocode), vjust = -0.5, hjust = 0.5, size = 3) +
  labs(title = "Risktaking vs. Age across Countries GPS",
       x = "Hardship", y = "Age") +
  theme_minimal() +
  theme(legend.position = "none", plot.title = element_text(hjust = 0.5))

Scatterplot of the relationship between hardship index and age-effect estimates

model_age <- lm(hardship ~ mean_z_score_age, data = cleaned_data)
model_gender <- lm(hardship ~ gender, data = cleaned_data)
# Coefficients for age-effect
age_coefficients <- coef(model_age)

# Coefficients for gender-effect
gender_coefficients <- coef(model_gender)
# Skalierung des Z-Scores für das Alter anpassen
cleaned_data$z_score_age_scaled <- 15 * cleaned_data$mean_z_score_age + 42

# Scatterplot for age-effect
ggplot(cleaned_data, aes(x = mean_z_score_age, y = hardship)) +
  geom_point(size = 2) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "blue") +
  labs(title = "Scatterplot of Hardship vs Age",
       x = "Age", y = "Hardship") +
  theme_minimal()

Laura has to rewrite the code –> Table already included with the lm

# Fit das Modell mit "age"
model_gender <- lm(risktaking ~ age + gender, data = cleaned_data)

# Intercept and Slope for age
intercept_age <- coef(model)["(Intercept)"]
slope_age <- coef(model)["age"]

summary(model)

Call:
lm(formula = risktaking ~ age * gender, data = cleaned_data)

Residuals:
    Min      1Q  Median      3Q     Max 
-2.3288 -0.6770 -0.0426  0.6446  3.0662 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.6413392  0.0131258  48.861  < 2e-16 ***
age         -0.0124875  0.0002925 -42.692  < 2e-16 ***
gender      -0.1285097  0.0178148  -7.214 5.50e-13 ***
age:gender  -0.0021551  0.0003944  -5.464 4.66e-08 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.9644 on 79435 degrees of freedom
Multiple R-squared:  0.07001,   Adjusted R-squared:  0.06998 
F-statistic:  1993 on 3 and 79435 DF,  p-value: < 2.2e-16

Laura has to rewrite the code –> Table already included with the lm

Create a table with the following information: country, isocode, n (count of participants), female percentage (%), mean age, age range, and risktaking

Four loup

Group by country mutate intercept run and save intercept model

# Group the data by country
table_data <- gps_data %>%
  group_by(country, isocode) %>%
  summarize(
    n = n(),
    female_percentage = mean(gender == 1) * 100,
    mean_age = mean(age, na.rm = TRUE),
    age_range = paste(min(age, na.rm = TRUE), "-", max(age, na.rm = TRUE)),
    mean_risktaking = mean(risktaking, na.rm = TRUE)
  )
`summarise()` has grouped output by 'country'. You can override using the `.groups` argument.
# Intercept and slope for age
intercept_age <- 0.5759701
slope_age <- -0.0137902

# Intercept and slope for gender
intercept_gender <- 0.124666 
slope_gender <- -0.227338 

# Add the intercept and slope information to the table
table_data <- table_data %>%
  mutate(
    intercept_age = intercept_age,
    slope_age = slope_age,
    intercept_gender = intercept_gender,
    slope_gender = slope_gender,
  )

# Display the updated table
table_data

#Preparing for linear regression

# Check for missing values in 'Country' and 'Risktaking' columns
missing_country <- anyNA(cleaned_data$country)
missing_risktaking <- anyNA(cleaned_data$risktaking)

# Print the results
cat("Missing values in 'Country': ", missing_country, "\n")
Missing values in 'Country':  FALSE 
cat("Missing values in 'Risktaking': ", missing_risktaking, "\n")
Missing values in 'Risktaking':  FALSE 
# Clean the data by removing records with missing values
cleaned_data <- gps_data %>%
  drop_na(country, risktaking, age)

# Split the data by country and perform linear regression for each country
regression_results <- cleaned_data %>%
  group_by(country) %>%
  do(model = lm(risktaking ~ age, data = .)) %>%
  summarize(
    country = first(country),
    intercept = summary(model)$coefficients[1],
    slope = summary(model)$coefficients[2],
    r_squared = summary(model)$r.squared
  )

# Display regression results for each country
print(regression_results)

analyze the results by using “Balkendiagramm”

what does 0 mean? // centered age = mean age of the sample
ggplot(data = regression_results, aes(x = country, y = intercept)) +
  geom_bar(stat = "identity", fill = "blue") +
  labs(title = "Intercepts for Risktaking by Country", x = "Country", y = "Intercept Value") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

View the countries with intercept values over 0.75

high_intercept_countries <- subset(regression_results, intercept > 0.75)

# View the countries with intercept values over 0.75
print(high_intercept_countries)

Scatterplots for ‘high_intercept_countries’ contains the top 17 countries

# Create scatterplots with regression lines for countries with intercept > 0.75
plots <- lapply(high_intercept_countries$country, function(country) {
  p <- ggplot(subset(cleaned_data, country == country), aes(x = age, y = risktaking, color = factor(gender))) +
    geom_point(size = 0.5) +
    geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
    labs(title = paste("Linear Regression for", country),
         subtitle = paste("Intercept:", round(high_intercept_countries$intercept[high_intercept_countries$country == country], 2)),
         color = "Gender") +
    scale_color_manual(values = c("0" = "blue", "1" = "red"), labels = c("Male", "Female"))
  print(p)
})


# Save the plots in a directory
dir.create("individual_country_plots", showWarnings = FALSE)
setwd("individual_country_plots")
Warning: The working directory was changed to /Users/laurabazzigher/Documents/GitHub/risk_wvs/code/individual_country_plots inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
for (i in seq_along(plots)) {
  filename <- paste0("plot_", i, ".png")
  ggsave(filename, plot = plots[[i]], width = 8, height = 6)
}


# Switch back to the original working directory
setwd("..")

print("Individual plots for countries with intercept > 0.75 are saved in the 'individual_country_plots' directory.")
[1] "Individual plots for countries with intercept > 0.75 are saved in the 'individual_country_plots' directory."

Scatterplots for ‘high_intercept_countries’ contains the top 17 countries

plots <- lapply(high_intercept_countries$country, function(country) {
  p <- ggplot(subset(cleaned_data, country == country), aes(x = ranking.x, y = risktaking, color = factor(gender))) +
    geom_point(size = 0.5) +
    geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
    labs(title = paste("Linear Regression for", country),
         subtitle = paste("Intercept:", round(high_intercept_countries$intercept[high_intercept_countries$country == country], 2)),
         color = "Gender") +
    scale_color_manual(values = c("0" = "blue", "1" = "red"), labels = c("Male", "Female"))
  print(p)
})
Error in `geom_point()`:
! Problem while computing aesthetics.
ℹ Error occurred in the 1st layer.
Caused by error:
! object 'ranking.x' not found
Backtrace:
  1. base::lapply(...)
  2. global FUN(X[[i]], ...)
  4. ggplot2:::print.ggplot(p)
  6. ggplot2:::ggplot_build.ggplot(x)
  7. ggplot2:::by_layer(...)
     ...
 15. l$compute_aesthetics(d, plot)
 16. ggplot2 (local) compute_aesthetics(..., self = self)
 17. ggplot2:::scales_add_defaults(...)
 18. base::lapply(aesthetics[new_aesthetics], eval_tidy, data = data)
 19. rlang (local) FUN(X[[i]], ...)

Create scatterplots with regression lines for individual countries

# Examples for countries
regression_results <- data.frame(
  country = c("Algeria", "Argentina", "Austria"),
  intercept = c(0.92053422, 0.51698822, 0.42606684),
  slope = c(-0.0146641801, -0.0115569623, -0.0108763042),
  r_squared = c(5.232529e-02, 5.638271e-02, 3.539810e-02    )
)

# Create scatterplots with regression lines for each country
plots <- lapply(seq_len(nrow(regression_results)), function(i) {
  country <- regression_results$country[i]
  intercept <- regression_results$intercept[i]
  slope <- regression_results$slope[i]
  r_squared <- regression_results$r_squared[i]
  
  p <- ggplot(subset(cleaned_data, country == country), aes(x = age, y = risktaking)) +
    geom_point() +
    geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "blue") +
    labs(title = paste("Linear Regression for", country),
         subtitle = paste("Intercept:", round(intercept, 2),
                           "Slope:", round(slope, 2),
                           "R-squared:", round(r_squared, 2)))
  print(p)
})


# Save the plots in a directory
dir.create("individual_country_plots", showWarnings = FALSE)
setwd("individual_country_plots")
Warning: The working directory was changed to /Users/laurabazzigher/Documents/GitHub/risk_wvs/code/individual_country_plots inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
for (i in seq_along(plots)) {
  filename <- paste0("plot_", i, ".png")
  ggsave(filename, plot = plots[[i]], width = 8, height = 6)
}


# Switch back to the original working directory
setwd("..")

print("Individual plots are saved in the 'individual_country_plots' directory.")
[1] "Individual plots are saved in the 'individual_country_plots' directory."

Scatterplot with include both individual country regression lines and an overall regression line

# Calculate the overall regression line
overall_lm <- lm(risktaking ~ age, data = cleaned_data)

# Create a scatterplot with separate regression lines for each country
# and an overall regression line
ggplot(cleaned_data, aes(x = age, y = risktaking, color = country)) +
  geom_point(size = 0.5) +                      # Adjust point size
  geom_smooth(method = "lm", se = FALSE, size = 0.5) + # Solid line for individual countries
  geom_abline(intercept = coef(overall_lm)[1], slope = coef(overall_lm)[2], 
              color = "black", size = 1) + # Add the overall regression line in red
  labs(title = "Scatterplot with Regression Lines for risktaking and age by Country", 
       x = "Age", y = "Risktaking") +
  theme(legend.position = "bottom",          # Move the legend below the graph
        legend.key.size = unit(0.1, "in"))  # Adjust the size of the legend key

Combined List of all 3 files

Prepare Wave 5

# Load the data
WV5_data <- readRDS("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/WV6_dataset_wave_5_6/F00007944-WV5_Data_R_v20180912.rds") # Data of Wave 5

#rename the variables
WV5_data <- WV5_data_df %>%
  rename(sex = V235, age = V237, country = V2, wave = V1, risktaking = V86)
WV5_data

#select only the variables of interest
WV5_data <- WV5_data %>%
  select(sex, age, country, wave, risktaking)
WV5_data

#decode the country names 
countrynames = read.csv("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/WV6_dataset_wave_5_6/countrynames.txt", header=FALSE,as.is=TRUE)
colnames(countrynames) = c("code", "name")
WV5_data$country_lab = countrynames$name [match(WV5_data$country, countrynames$code)]
table(WV5_data$country_lab)
WV5_data

Prepare Wave 6

#Read Dataset (Wave 6)
WV6_data <- load("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/WV6_dataset_wave_5_6/WV6_Data_R_v20201117.rdata") 
WV6_data <- WV6_Data_R_v20201117 
print(WV6_data)

#rename variables in Wave 6

WV6_data <- WV6_data %>%
  rename(wave = V1, sex = V240, age = V242,country = V2, risktaking = V76)

#select only the variables of interest
WV6_data <- WV6_data %>%
  select(wave, sex, age, country, sex,risktaking)
WV6_data

#decode daraset (Wave 6)

countrynames = read.csv("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/WV6_dataset_wave_5_6/countrynames.txt", header=FALSE,as.is=TRUE)
colnames(countrynames) = c("code", "name")
WV6_data$country_lab = countrynames$name [match(WV6_data$country, countrynames$code)]
table(WV6_data$country_lab)
WV6_data

Prepare GPS dataset (cleaned_data)

# Rename "Country" in "country_lab"
print(colnames(cleaned_data))

#combine the 2 dataset (Wave 6 + Wave 5)

WV5_data
WV6_data
data = rbind(WV5_data, WV6_data)
data

#number of countries

length(unique(data$country_lab))

Merge the datasets based on common variables

# Find common countries between the two datasets
common_countries <- intersect(data$country_lab, cleaned_data$country)

# Display the common countries
print(common_countries)

Find countries in “data” but not in “cleaned_data”

countries_only_in_data <- setdiff(data$country_lab, cleaned_data$country)

# Find countries in "cleaned_data" but not in "data"
countries_only_in_cleaned_data <- setdiff(cleaned_data$country, data$country_lab)

# Display the results
cat("Countries only in 'data':", countries_only_in_data, "\n")
cat("Countries only in 'cleaned_data':", countries_only_in_cleaned_data, "\n")
---
title: "R Notebook"
output: html_notebook
---

#Load packages
```{r}
library(data.table)
library(tidyr)
library(haven)
library(ggplot2)
library(dplyr)
library(gridExtra)
library(lmtest)
library(maps)
library(mapdata)
library(readxl)
```

#Add data GPS
```{r}
gps_data <- haven::read_dta("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/GPS_dataset_individual_level/individual_new.dta")

head(country_data)
```

# Clean the data from missing values
```{r}
# Clean the data by removing records with missing values
cleaned_data <- gps_data %>%
  drop_na(country, isocode, risktaking, gender, age)

# Calculate the number of records removed per variable
records_removed_per_variable <- colSums(is.na(gps_data)) - colSums(is.na(cleaned_data))

# Display the cleaned data
cleaned_data

# Display the number of records removed per variable
records_removed_per_variable
```

# List of all variable
```{r}
# List all variables
variable_list <- names(cleaned_data) 

# Display the list of variables
print(variable_list)
```
#select only the variables of interest
```{r}
cleaned_data <- cleaned_data %>%
  select(country, isocode, ison, risktaking, gender, age)
cleaned_data
```

############################# Calculate Z-Score for Age per Country
```{r}
cleaned_data <- cleaned_data %>%
  group_by(country) %>%
  mutate(z_score_age = scale(age))

# Display the new column with Z-Scores per Country
head(cleaned_data)
```


# Create a new column "age_group" with the age categories
```{r}
cleaned_data$agecat <- cut(
  cleaned_data$age,
  breaks = c(15, 20, 30, 40, 50, 60, 70, 80, Inf), # The category boundaries
  labels = c("15-19", "20-29", "30-39", "40-49", "50-59", "60-69", "70-79", "80+"), # The category labels
  right = FALSE # Left end (inclusive), right end (exclusive)
)

# Display the new column
head(cleaned_data)
```
# Calculate the the mean and add column
```{r}
# Calculate mean values for risktaking, gender, and age per country
country_means <- cleaned_data %>%
  group_by(country) %>%
  summarize(
    mean_risktaking = mean(risktaking, na.rm = TRUE),
    mean_gender = mean(gender, na.rm = TRUE),
    mean_age = mean(age, na.rm = TRUE), 
    mean_z_score_age = mean(z_score_age, na.rm = TRUE),
  )

# Merge mean values back to the original dataset
cleaned_data <- merge(cleaned_data, country_means, by = "country", all.x = TRUE)

# Display the updated dataset
head(cleaned_data)

```

#Hardship-list
```{r}
excel_path <- "/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/Hardship/Hardship.xlsx"

hardship_values <- read_excel(excel_path) # Read data from the Excel file

print(colnames(cleaned_data)) # Display column names in cleaned_data

print(colnames(hardship_values)) # Display column names in hardship_data

cleaned_data <- left_join(cleaned_data, hardship_values, by = "country") # Perform the left_join operation

head(cleaned_data) # Display the updated dataset
```

#select only the variables of interest
```{r}
gps_country <- cleaned_data %>%
  select(country, isocode, ison, mean_risktaking, mean_gender, mean_age, mean_z_score_age, hardship)
gps_country
```

# Calculate mean values for risktaking, gender, and age per country
```{r}
#select only the variables of interest
gps_country <- cleaned_data %>%
  group_by(country) %>%
  summarize(
    isocode = first(isocode),
    ison = first(ison),
    mean_risktaking = mean(risktaking, na.rm = TRUE),
    mean_gender = mean(gender, na.rm = TRUE),
    mean_age = mean(age, na.rm = TRUE),
    hardship = first(hardship)
  ) %>%
  ungroup()

gps_country
```

#######################################################################################################################
#######################################################################################################################
# Descriptive 

# Number of courtries after data cleaning
```{r}
# Determine the number of different countries
number_of_countries <- length(unique(cleaned_data$country))

# Display the number of different countries
number_of_countries
```

###########
##########
##########
#CODE FOR THE PRESENTATION 
###########
##########
##########

# Worldmap of the recorded countries
```{r}
world_map <- map_data("world") # Create a world map with country borders

recorded_countries <- unique(cleaned_data$country) # Get the list of recorded countries from your cleaned_data

world_map$recorded <- ifelse(world_map$region %in% recorded_countries, "Recorded", "Not Recorded") # Create a new variable indicating whether a country has been recorded or not

# Plot the world map with recorded countries highlighted
ggplot(world_map, aes(x = long, y = lat, group = group, fill = recorded)) +
  geom_polygon(color = "white") +
  scale_fill_manual(values = c("Recorded" = "darkblue", "Not Recorded" = "lightgrey"), guide = "none") +  # Set guide to "none" to remove the legend
  theme_void() +
  labs(title = "GPS", fill = "Status") +
  theme(legend.position = "none", plot.title = element_text(hjust = 0.5))  # Remove legend and center the title

```
#############################################################
#############################################################
# All about the age
```{r}
# Age Range
age_min <- min(cleaned_data$age, na.rm = TRUE)
age_max <- max(cleaned_data$age, na.rm = TRUE)

# Average Age
average_age <- mean(cleaned_data$age, na.rm = TRUE)

# Median Age
median_age <- median(cleaned_data$age, na.rm = TRUE)

# Display the age statistics
cat("Age Range: ", age_min, " to ", age_max, "\n")
cat("Average Age: ", average_age, "\n")
cat("Median Age: ", median_age, "\n")
```

# Number of participants in each age category
```{r}
# number of participants in each age category
agecat_counts <- table(cleaned_data$agecat)

# Display the number of participants in the age categories
print(agecat_counts)
```

# Boxplot of Age Distribution per Country 
```{r}
ggplot(cleaned_data, aes(x = age)) +
  geom_histogram(binwidth = 0.5, fill = "lightblue", color = "blue") +
  labs(x = "Age", y = "Frequency", title = "Histogram of Age Distributionn") +
  theme_minimal()

ggplot(cleaned_data, aes(x = country, y = age)) +
  geom_boxplot(fill = "lightblue", color = "blue") +
  labs(title = "Distribution of Age per Country", x = "Country", y = "Age") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

ggplot(cleaned_data, aes(x = country, y = risktaking)) +
  geom_boxplot(fill = "lightblue", color = "blue") +
  labs(title = "Distribution of Age per Country", x = "Country", y = "Risktaking") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
```


# All about the gender
```{r}
# Calculate the counts of females (gender = 1) and males (gender = 2)
gender_counts <- table(cleaned_data$gender)

# Display the counts
cat("Number of Females: ", gender_counts[1], "\n")
cat("Number of Males: ", gender_counts[2], "\n")
```

# Table with country and gender
```{r}
# Create a table that breaks down the number of participants by country and gender
gender_by_country <- xtabs(~ country + gender, data = cleaned_data)

# Rename columns and rows for better readability
colnames(gender_by_country) <- c("Female", "Male")
rownames(gender_by_country) <- unique(cleaned_data$country)

# Calculate the total participants by summing the rows
total_participants <- rowSums(gender_by_country)

# Create a data frame with country, total participants, female, and male
result_table <- data.frame(
  country = rownames(gender_by_country),
  total_participants = total_participants,
  female = gender_by_country[, "Female"],
  male = gender_by_country[, "Male"]
)

# Display the result table
result_table
```
# Analysis of item risktaking 
```{r}
# Summary statistics for risktaking
risk_summary <- summary(cleaned_data$risktaking)

# Histogram of risktaking
ggplot(cleaned_data, aes(x = risktaking)) +
  geom_histogram(binwidth = 1, fill = "lightblue", color = "blue", alpha = 0.7) +
  labs(title = "Distribution of Risk-Taking", x = "Risk-Taking Score", y = "Frequency") +
  theme_minimal()

# Boxplot of risktaking by gender
ggplot(cleaned_data, aes(x = as.factor(gender), y = risktaking, fill = as.factor(gender))) +
  geom_boxplot() +
  labs(title = "Risk-Taking by Gender", x = "Gender", y = "Risk-Taking Score") +
  scale_x_discrete(labels = c("0" = "male", "1" = "female")) +
  theme_minimal() +
  guides(fill = FALSE)

# Risk vs age
ggplot(cleaned_data, aes(risktaking, age)) +
  geom_point(size = 0.2) +  
  geom_smooth(method = "lm")

# Boxplot of risktaking by age group
cleaned_data$age_group <- cut(cleaned_data$age, breaks = c(0, 25, 35, 45, 55, Inf),
                               labels = c("18-25", "26-35", "36-45", "46-55", "56+"))
ggplot(cleaned_data, aes(x = age_group, y = risktaking, fill = age_group)) +
  geom_boxplot() +
  labs(title = "Risk-Taking by Age Group", x = "Age Group", y = "Risk-Taking Score") +
  theme_minimal() +
  guides(fill = FALSE)

# Boxplot of risktaking by country
ggplot(cleaned_data, aes(x = country, y = risktaking)) +
  geom_boxplot(fill = "lightblue", color = "blue") +
  labs(title = "Risk-Taking by Country", x = "Country", y = "Risk-Taking Score") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
```

###########
###########
###########
FOR PRESENTATION
############################# 


# Risk vs age with color-coded gender per Country
```{r}
# Skalierung des Z-Scores für das Alter anpassen
cleaned_data$z_score_age_scaled <- 15 * cleaned_data$z_score_age + 42

# Risk vs age with color-coded gender per Country
ggplot(cleaned_data, aes(z_score_age_scaled, risktaking, color = factor(gender))) +
  geom_point(size = 0.1) +  
  geom_smooth(method = "lm") +
  geom_vline(xintercept = 42, linetype = "dashed", color = "black", size = 1) +  # Vertikale Linie für den Mittelwert
  scale_color_manual(values = c("blue", "red"), labels = c("Male", "Female")) +
  labs(color = "Gender") +
  ggtitle("Risk vs Age GPS") +
  xlab("Age (Mean = 42, SD = 15)") +
  scale_x_continuous(breaks = seq(0, 100, by = 15), limits = c(0, 100)) +  # Anpassung der Intervalle auf der X-Achse
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
```


# Risktaking vs. hardship across countries
```{r}
# Risktaking vs. Hardship across countries with summarized points
ggplot(cleaned_data, aes(x = hardship, y = mean_risktaking)) +
  geom_point(aes(color = isocode), size = 3) +
  geom_text(aes(label = isocode), vjust = -0.5, hjust = 0.5, size = 3) +
  labs(title = "Risktaking vs. Hardship across Countries GPS",
       x = "Hardship", y = "Risktaking") +
  theme_minimal() +
  theme(legend.position = "none", plot.title = element_text(hjust = 0.5))
```
```{r}
# Risktaking vs. Hardship across countries with summarized points
ggplot(cleaned_data, aes(x = hardship, y = mean_z_score_age)) +
  geom_point(aes(color = isocode), size = 3) +
  geom_text(aes(label = isocode), vjust = -0.5, hjust = 0.5, size = 3) +
  labs(title = "Risktaking vs. Age across Countries GPS",
       x = "Hardship", y = "Age") +
  theme_minimal() +
  theme(legend.position = "none", plot.title = element_text(hjust = 0.5))
```


# Scatterplot of the relationship between hardship index and age-effect estimates
```{r}
model_age <- lm(hardship ~ mean_z_score_age, data = cleaned_data)
model_gender <- lm(hardship ~ gender, data = cleaned_data)

```

```{r}
# Coefficients for age-effect
age_coefficients <- coef(model_age)

# Coefficients for gender-effect
gender_coefficients <- coef(model_gender)

```

```{r}

# Scatterplot for age-effect
ggplot(cleaned_data, aes(x = mean_z_score_age, y = hardship)) +
  geom_point(size = 2) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "blue") +
  labs(title = "Scatterplot of Hardship vs Age",
       x = "Age", y = "Hardship") +
  theme_minimal()

```


################################################################################################################################################################################################################################################


###########
###########
###########
Laura has to rewrite the code --> Table already included with the lm

```{r}
# Fit das Modell mit "age"
model_gender <- lm(risktaking ~ age + gender, data = cleaned_data)

# Intercept and Slope for age
intercept_age <- coef(model)["(Intercept)"]
slope_age <- coef(model)["age"]

summary(model)
```

###########
###########
###########
Laura has to rewrite the code --> Table already included with the lm


# Create a table with the following information: country, isocode, n (count of participants), female percentage (%), mean age, age range, and risktaking

### Four loup 
Group by country
mutate intercept 
run and save intercept model 
```{r}
# Group the data by country
table_data <- gps_data %>%
  group_by(country, isocode) %>%
  summarize(
    n = n(),
    female_percentage = mean(gender == 1) * 100,
    mean_age = mean(age, na.rm = TRUE),
    age_range = paste(min(age, na.rm = TRUE), "-", max(age, na.rm = TRUE)),
    mean_risktaking = mean(risktaking, na.rm = TRUE)
  )

# Intercept and slope for age
intercept_age <- 0.5759701
slope_age <- -0.0137902

# Intercept and slope for gender
intercept_gender <- 0.124666 
slope_gender <- -0.227338 

# Add the intercept and slope information to the table
table_data <- table_data %>%
  mutate(
    intercept_age = intercept_age,
    slope_age = slope_age,
    intercept_gender = intercept_gender,
    slope_gender = slope_gender,
  )

# Display the updated table
table_data
```


#############################################
#############################################



#Preparing for linear regression
```{r}
# Check for missing values in 'Country' and 'Risktaking' columns
missing_country <- anyNA(cleaned_data$country)
missing_risktaking <- anyNA(cleaned_data$risktaking)

# Print the results
cat("Missing values in 'Country': ", missing_country, "\n")
cat("Missing values in 'Risktaking': ", missing_risktaking, "\n")

```


```{r}
# Clean the data by removing records with missing values
cleaned_data <- gps_data %>%
  drop_na(country, risktaking, age)

# Split the data by country and perform linear regression for each country
regression_results <- cleaned_data %>%
  group_by(country) %>%
  do(model = lm(risktaking ~ age, data = .)) %>%
  summarize(
    country = first(country),
    intercept = summary(model)$coefficients[1],
    slope = summary(model)$coefficients[2],
    r_squared = summary(model)$r.squared
  )

# Display regression results for each country
print(regression_results)
```


# analyze the results by using "Balkendiagramm"
###### what does 0 mean? // centered age = mean age of the sample
```{r}
ggplot(data = regression_results, aes(x = country, y = intercept)) +
  geom_bar(stat = "identity", fill = "blue") +
  labs(title = "Intercepts for Risktaking by Country", x = "Country", y = "Intercept Value") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
```
# View the countries with intercept values over 0.75
```{r}
high_intercept_countries <- subset(regression_results, intercept > 0.75)

# View the countries with intercept values over 0.75
print(high_intercept_countries)
```


# Scatterplots for 'high_intercept_countries' contains the top 17 countries
```{r}
# Create scatterplots with regression lines for countries with intercept > 0.75
plots <- lapply(high_intercept_countries$country, function(country) {
  p <- ggplot(subset(cleaned_data, country == country), aes(x = age, y = risktaking, color = factor(gender))) +
    geom_point(size = 0.5) +
    geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
    labs(title = paste("Linear Regression for", country),
         subtitle = paste("Intercept:", round(high_intercept_countries$intercept[high_intercept_countries$country == country], 2)),
         color = "Gender") +
    scale_color_manual(values = c("0" = "blue", "1" = "red"), labels = c("Male", "Female"))
  print(p)
})

# Save the plots in a directory
dir.create("individual_country_plots", showWarnings = FALSE)
setwd("individual_country_plots")

for (i in seq_along(plots)) {
  filename <- paste0("plot_", i, ".png")
  ggsave(filename, plot = plots[[i]], width = 8, height = 6)
}

# Switch back to the original working directory
setwd("..")

print("Individual plots for countries with intercept > 0.75 are saved in the 'individual_country_plots' directory.")

```

# Scatterplots for 'high_intercept_countries' contains the top 17 countries
```{r}
plots <- lapply(high_intercept_countries$country, function(country) {
  p <- ggplot(subset(cleaned_data, country == country), aes(x = ranking.x, y = risktaking, color = factor(gender))) +
    geom_point(size = 0.5) +
    geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
    labs(title = paste("Linear Regression for", country),
         subtitle = paste("Intercept:", round(high_intercept_countries$intercept[high_intercept_countries$country == country], 2)),
         color = "Gender") +
    scale_color_manual(values = c("0" = "blue", "1" = "red"), labels = c("Male", "Female"))
  print(p)
})

# Save the plots in a directory
dir.create("individual_country_plots", showWarnings = FALSE)
setwd("individual_country_plots")

for (i in seq_along(plots)) {
  filename <- paste0("plot_", i, ".png")
  ggsave(filename, plot = plots[[i]], width = 8, height = 6)
}

# Switch back to the original working directory
setwd("..")

print("Individual plots for countries with intercept > 0.75 are saved in the 'individual_country_plots' directory.")
```

# Create scatterplots with regression lines for individual countries
```{r}
# Examples for countries
regression_results <- data.frame(
  country = c("Algeria", "Argentina", "Austria"),
  intercept = c(0.92053422, 0.51698822, 0.42606684),
  slope = c(-0.0146641801, -0.0115569623, -0.0108763042),
  r_squared = c(5.232529e-02, 5.638271e-02, 3.539810e-02	)
)

# Create scatterplots with regression lines for each country
plots <- lapply(seq_len(nrow(regression_results)), function(i) {
  country <- regression_results$country[i]
  intercept <- regression_results$intercept[i]
  slope <- regression_results$slope[i]
  r_squared <- regression_results$r_squared[i]
  
  p <- ggplot(subset(cleaned_data, country == country), aes(x = age, y = risktaking)) +
    geom_point() +
    geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "blue") +
    labs(title = paste("Linear Regression for", country),
         subtitle = paste("Intercept:", round(intercept, 2),
                           "Slope:", round(slope, 2),
                           "R-squared:", round(r_squared, 2)))
  print(p)
})

# Save the plots in a directory
dir.create("individual_country_plots", showWarnings = FALSE)
setwd("individual_country_plots")

for (i in seq_along(plots)) {
  filename <- paste0("plot_", i, ".png")
  ggsave(filename, plot = plots[[i]], width = 8, height = 6)
}

# Switch back to the original working directory
setwd("..")

print("Individual plots are saved in the 'individual_country_plots' directory.")
```

# Scatterplot with include both individual country regression lines and an overall regression line
```{r}
# Calculate the overall regression line
overall_lm <- lm(risktaking ~ age, data = cleaned_data)

# Create a scatterplot with separate regression lines for each country
# and an overall regression line
ggplot(cleaned_data, aes(x = age, y = risktaking, color = country)) +
  geom_point(size = 0.5) +                      # Adjust point size
  geom_smooth(method = "lm", se = FALSE, size = 0.5) + # Solid line for individual countries
  geom_abline(intercept = coef(overall_lm)[1], slope = coef(overall_lm)[2], 
              color = "black", size = 1) + # Add the overall regression line in red
  labs(title = "Scatterplot with Regression Lines for risktaking and age by Country", 
       x = "Age", y = "Risktaking") +
  theme(legend.position = "bottom",          # Move the legend below the graph
        legend.key.size = unit(0.1, "in"))  # Adjust the size of the legend key
```



#################################################################################################################################################################
# Combined List of all 3 files

# Prepare Wave 5
```{r}
# Load the data
WV5_data <- readRDS("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/WV6_dataset_wave_5_6/F00007944-WV5_Data_R_v20180912.rds") # Data of Wave 5

#rename the variables
WV5_data <- WV5_data_df %>%
  rename(sex = V235, age = V237, country = V2, wave = V1, risktaking = V86)
WV5_data

#select only the variables of interest
WV5_data <- WV5_data %>%
  select(sex, age, country, wave, risktaking)
WV5_data

#decode the country names 
countrynames = read.csv("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/WV6_dataset_wave_5_6/countrynames.txt", header=FALSE,as.is=TRUE)
colnames(countrynames) = c("code", "name")
WV5_data$country_lab = countrynames$name [match(WV5_data$country, countrynames$code)]
table(WV5_data$country_lab)
WV5_data
```

# Prepare Wave 6
```{r}
#Read Dataset (Wave 6)
WV6_data <- load("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/WV6_dataset_wave_5_6/WV6_Data_R_v20201117.rdata") 
WV6_data <- WV6_Data_R_v20201117 
print(WV6_data)
```

#rename variables in Wave 6
```{r}
WV6_data <- WV6_data %>%
  rename(wave = V1, sex = V240, age = V242,country = V2, risktaking = V76)

#select only the variables of interest
WV6_data <- WV6_data %>%
  select(wave, sex, age, country, sex,risktaking)
WV6_data
```
#decode daraset (Wave 6)
```{r}
countrynames = read.csv("/Users/laurabazzigher/Documents/GitHub/risk_wvs/data/dataset/WV6_dataset_wave_5_6/countrynames.txt", header=FALSE,as.is=TRUE)
colnames(countrynames) = c("code", "name")
WV6_data$country_lab = countrynames$name [match(WV6_data$country, countrynames$code)]
table(WV6_data$country_lab)
WV6_data
```
# Prepare GPS dataset (cleaned_data)
```{r}
# Rename "Country" in "country_lab"
print(colnames(cleaned_data))
```

#combine the 2 dataset (Wave 6 + Wave 5)
```{r}
WV5_data
WV6_data
data = rbind(WV5_data, WV6_data)
data

#number of countries

length(unique(data$country_lab))
```


# Merge the datasets based on common variables
```{r}
# Find common countries between the two datasets
common_countries <- intersect(data$country_lab, cleaned_data$country)

# Display the common countries
print(common_countries)

```

# Find countries in "data" but not in "cleaned_data"
```{r}
countries_only_in_data <- setdiff(data$country_lab, cleaned_data$country)

# Find countries in "cleaned_data" but not in "data"
countries_only_in_cleaned_data <- setdiff(cleaned_data$country, data$country_lab)

# Display the results
cat("Countries only in 'data':", countries_only_in_data, "\n")
cat("Countries only in 'cleaned_data':", countries_only_in_cleaned_data, "\n")
```



